In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [55]:
# Importing the data. Spray data for the test-set years (2008, 2010, 2012, 2014) is not provided, so spray information is not used in the analysis.
train = pd.read_csv('./assets/train.csv')
test =pd.read_csv('./assets/test.csv')
weather = pd.read_csv('./assets/weather.csv')
spray =pd.read_csv('./assets/spray.csv')
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
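As a quick check of the note above (a sketch; it assumes the Date columns parse cleanly with pd.to_datetime):
In [ ]:
# years covered by the spray records vs. years in the test set
print sorted(pd.to_datetime(spray['Date']).dt.year.unique())
print sorted(pd.to_datetime(test['Date']).dt.year.unique())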
In [56]:
spray.head()
Out[56]:
In [57]:
train[['Date','Trap','Species','Latitude','Longitude','NumMosquitos','WnvPresent']].head()
Out[57]:
In [58]:
weather.head()
Out[58]:
In [59]:
train.Trap.unique()
Out[59]:
In [61]:
# Number of mosquitoes in traps with or without WNV
ax = sns.boxplot( y="NumMosquitos",x="WnvPresent", data=train,showfliers=False)
plt.show()
In [10]:
# Given the number of mosquitoes in a trap, what is the probability of finding the virus in that trap?
number_of_mosquito_probability = train.groupby(pd.cut(train['NumMosquitos'], range(0, 50, 5)), as_index=False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos'] = [5, 10, 15, 20, 25, 30, 35, 40, 45]  # upper edge of each 5-mosquito bin
In [103]:
ax = sns.barplot(x="NumMosquitos", y="WnvPresent", data=number_of_mosquito_probability)
ax.set(xlabel="Number of Mosquitoes", ylabel="Probability of West Nile Virus")
plt.show()
In [63]:
# Effect of mosquito species on the presence of WNV
mosquito_type = train.groupby(train['Species'], as_index=False).WnvPresent.sum()
ax = sns.barplot(x="WnvPresent", y="Species", data=mosquito_type)
ax.set(xlabel='Number of WNV-positive records')
plt.show()
In [64]:
spec1 = train[train['Species'] =='CULEX RESTUANS']
number_of_mosquito_probability = spec1.groupby(pd.cut(spec1['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
plt.show()
In [65]:
train.Species.unique()
Out[65]:
In [66]:
spec1 = train[train['Species'] =='CULEX PIPIENS/RESTUANS']
number_of_mosquito_probability = spec1.groupby(pd.cut(spec1['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
plt.show()
In [67]:
spec1 = train[train['Species'] =='CULEX PIPIENS']
number_of_mosquito_probability = spec1.groupby(pd.cut(spec1['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
plt.show()
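The three cells above repeat the same binned-probability plot for individual species; a compact loop version (a sketch producing one figure per species) could replace them:
In [ ]:
# sketch: per-species probability of WNV vs. binned mosquito counts
for species in ['CULEX RESTUANS', 'CULEX PIPIENS/RESTUANS', 'CULEX PIPIENS']:
    spec = train[train['Species'] == species]
    prob = spec.groupby(pd.cut(spec['NumMosquitos'], range(0, 50, 5)), as_index=False).WnvPresent.mean()
    prob['NumMosquitos'] = [5, 10, 15, 20, 25, 30, 35, 40, 45]  # upper edge of each bin
    sns.barplot(x="NumMosquitos", y="WnvPresent", data=prob)
    plt.title(species)
    plt.show()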
In [68]:
#Adapted from a Kaggle kernel showing how to plot the measurement points on the map (https://www.kaggle.com/dchudz/predict-west-nile-virus/where-are-the-measurement-points)
#%matplotlib inline
from sklearn.neighbors import KernelDensity
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = train[['Date', 'Trap','Longitude', 'Latitude', 'WnvPresent']]
alpha_cm = plt.cm.Reds
alpha_cm._init()
alpha_cm._lut[:-3,-1] = abs(np.logspace(0, 1, alpha_cm.N) / 10 - 1)[::-1]
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)
sigthings = traps[traps['WnvPresent'] > 0]
sigthings = sigthings.groupby(['Date', 'Trap','Longitude', 'Latitude']).max()['WnvPresent'].reset_index()
X = sigthings[['Longitude', 'Latitude']].values
kd = KernelDensity(bandwidth=0.02)
kd.fit(X)
xv,yv = np.meshgrid(np.linspace(-88, -87.5, 100), np.linspace(41.6, 42.1, 100))
gridpoints = np.array([xv.ravel(),yv.ravel()]).T
zv = np.exp(kd.score_samples(gridpoints).reshape(100,100))
plt.figure(figsize=(10,14))
plt.imshow(mapdata,
           cmap=plt.get_cmap('gray'),
           extent=lon_lat_box,
           aspect=aspect)
plt.imshow(zv,
           origin='lower',
           cmap=alpha_cm,
           extent=lon_lat_box,
           aspect=aspect)
locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:,0], locations[:,1], marker='x')
plt.show()
#plt.savefig('heatmap.png')
In [69]:
#Adapted from the same Kaggle kernel (https://www.kaggle.com/dchudz/predict-west-nile-virus/where-are-the-measurement-points): spray locations (x) and trap locations (red o) on the map
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = spray[['Date', 'Longitude', 'Latitude']]   # note: this holds the spray coordinates
traps1 = train[['Longitude', 'Latitude']]          # trap coordinates
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)
plt.figure(figsize=(10, 14))
plt.imshow(mapdata,
           cmap=plt.get_cmap('gray'),
           extent=lon_lat_box,
           aspect=aspect)
locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
loc = traps1[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:, 0], locations[:, 1], marker='x')
plt.scatter(loc[:, 0], loc[:, 1], marker='o', color='red')
plt.show()
In [70]:
# Map of sprayed trap locations. Note: `workingdata` (spray coordinates) and `data` (traps matched
# to spray points) are built in cells In [72]-In [78] below, so this cell must be run after those.
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = workingdata[['Longitude', 'Latitude']]
traps1 = data[['Longitude', 'Latitude']]
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)
plt.figure(figsize=(10, 14))
plt.imshow(mapdata,
           cmap=plt.get_cmap('gray'),
           extent=lon_lat_box,
           aspect=aspect)
locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
loc = traps1[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:, 0], locations[:, 1], marker='x')
plt.scatter(loc[:, 0], loc[:, 1], marker='o', color='red')
plt.show()
In [71]:
trap = train[['Longitude', 'Latitude']].drop_duplicates()
In [72]:
# Round the spray coordinates to 2 decimal places so they can be matched against trap coordinates
workingdata = spray[['Longitude', 'Latitude']]
longi = list(np.round(workingdata.Longitude, 2))
lati = list(np.round(workingdata.Latitude, 2))
In [73]:
# Round the unique trap coordinates to 2 decimal places, as lists, to match the spray coordinates above
trap = train[['Longitude', 'Latitude']].drop_duplicates()
trap_longi = list(np.round(trap.Longitude, 2))
trap_lati = list(np.round(trap.Latitude, 2))
In [74]:
# For each trap, count how many spray points share its rounded coordinates
f = []
for i in range(len(trap_longi)):
    addi = 0.0
    for j in range(len(longi)):
        if (trap_longi[i] == longi[j]) and (trap_lati[i] == lati[j]):
            addi = addi + 1
    f.append(addi)
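The nested loop above is O(traps x spray points); an equivalent vectorized sketch using a pandas merge on the same 2-decimal rounding (the names here are illustrative and not used later in the notebook):
In [ ]:
# sketch: count spray points per rounded trap coordinate with a groupby/merge
spray_counts = (spray[['Longitude', 'Latitude']].round(2)
                .groupby(['Longitude', 'Latitude']).size()
                .reset_index(name='sprayed'))
trap_rounded = train[['Longitude', 'Latitude']].drop_duplicates().round(2)
sprayed_per_trap = trap_rounded.merge(spray_counts, on=['Longitude', 'Latitude'], how='left').fillna(0)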
In [75]:
len(longi)
Out[75]:
In [76]:
print len(f)
sum(f)
Out[76]:
In [77]:
trap['sprayed'] = f
In [78]:
# Traps whose rounded coordinates matched more than one spray point
data = trap[trap['sprayed'] > 1.0]
In [79]:
#EDA
print(train.shape)
print(train.head())
print(train.describe(include ='all'))
print(train.isnull().sum())
print(train.dtypes)
In [80]:
#EDA
#M = missing data
#T = trace precipitation
print(weather.shape)
print(weather.head())
print(weather.describe(include ='all'))
print(weather.isnull().sum())
print(weather.dtypes)
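To see how much of the weather table is affected by these codes, a rough check (the exact whitespace around the 'M' and 'T' flags in weather.csv is an assumption, so values are stripped first):
In [ ]:
# count coded entries per column: 'M' = missing, 'T' = trace precipitation
coded = weather.astype(str).apply(lambda col: col.str.strip())
print (coded == 'M').sum()
print (coded == 'T').sum()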
In [81]:
weather.columns
Out[81]:
In [82]:
#only working with the data from Station No. 1 and using Tavg, total precipitation, and wind speed as variables
weather = weather[weather['Station']== 1]
weather = weather[['Date','Tavg','PrecipTotal','ResultSpeed']]
In [83]:
# Adding the weather data to the train and test data
train_add =train.join(weather.set_index('Date'), on='Date')
test_add = test.join(weather.set_index('Date'), on='Date')
In [84]:
# reformatting Tavg to float
train_add['Tavg']=train_add.Tavg.astype(float)
# assigning 0 instead of T and reformatting PrecipTotal to float
train_add['PrecipTotal']= np.where(train_add['PrecipTotal']==' T', '0.0', train_add['PrecipTotal'])
train_add['PrecipTotal']= train_add.PrecipTotal.astype(float)
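The same reformatting is presumably needed for test_add before it can be fed to a model; a sketch under that assumption:
In [ ]:
# assumed mirror of the cleaning above for the test set
test_add['Tavg'] = test_add.Tavg.astype(float)
test_add['PrecipTotal'] = np.where(test_add['PrecipTotal'] == ' T', '0.0', test_add['PrecipTotal'])
test_add['PrecipTotal'] = test_add.PrecipTotal.astype(float)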
In [100]:
# Tavg in traps with or without WNV
ax = sns.boxplot(y="Tavg", x="WnvPresent", data=train_add, showfliers=False)
ax.set(xlabel="West Nile Virus Present", ylabel="Average Temperature")
plt.show()
In [98]:
ax = sns.boxplot(y="PrecipTotal", x="WnvPresent", data=train_add, showfliers=False)
ax.set(xlabel="West Nile Virus Present", ylabel="Precipitation Total")
plt.show()
In [150]:
ax = sns.boxplot(y="ResultSpeed", x="WnvPresent", data=train_add)
plt.show()
In [248]:
from datetime import datetime
train['Date'] = pd.to_datetime(train['Date'], format='%Y-%m-%d')
test['Date'] = pd.to_datetime(test['Date'], format='%Y-%m-%d')
weather['Date'] = pd.to_datetime(weather['Date'], format='%Y-%m-%d')
spray['Date'] = pd.to_datetime(spray['Date'], format='%Y-%m-%d')
In [249]:
train['week'] = train['Date'].dt.weekofyear
train['year'] = train['Date'].dt.year
test['week'] = test['Date'].dt.weekofyear
test['year'] = test['Date'].dt.year
spray['year']=spray['Date'].dt.year
weather['year']=weather['Date'].dt.year
In [78]:
print test.columns
train.columns
Out[78]:
In [6]:
weeks = train.groupby(['year','week'], as_index=False).WnvPresent.sum()
In [80]:
test.head()
Out[80]:
In [71]:
weeks.dtypes
Out[71]:
In [79]:
ax = sns.lmplot(data=weeks, x='week', y='WnvPresent', hue='year', fit_reg=False, palette="Set1", markers='_')
plt.show()
In [45]:
weeks['index'] =weeks.index
weeks.head()
Out[45]:
In [307]:
weeks.year.unique()
Out[307]:
In [309]:
# Weekly count of WNV-positive records per season, darker lines for more recent years
fig = plt.figure(figsize=(7, 4))
ax = fig.gca()
ax.plot(weeks[weeks['year'] == 2007].week, weeks[weeks['year'] == 2007].WnvPresent, lw=2, color='darkred', alpha=0.1, label='2007')
ax.plot(weeks[weeks['year'] == 2009].week, weeks[weeks['year'] == 2009].WnvPresent, lw=2, color='darkred', alpha=0.3, label='2009')
ax.plot(weeks[weeks['year'] == 2011].week, weeks[weeks['year'] == 2011].WnvPresent, lw=2, color='darkred', alpha=0.5, label='2011')
ax.plot(weeks[weeks['year'] == 2013].week, weeks[weeks['year'] == 2013].WnvPresent, lw=2, color='darkred', alpha=1.0, label='2013')
ax.legend(loc='upper left')
plt.show()
In [312]:
w30 =pd.read_csv('weather_ave_30.csv')
In [313]:
w30.head()
Out[313]:
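weather_ave_30.csv is loaded from disk but never generated in this notebook; a minimal sketch of how such a file could be produced, assuming it holds 30-day rolling means of the Station 1 variables used above (the window, column choice, and file layout are assumptions):
In [ ]:
# hypothetical reconstruction of weather_ave_30.csv: rolling means over the previous 30 rows (~30 days of daily records)
w30_sketch = weather.sort_values('Date').set_index('Date')[['Tavg', 'PrecipTotal', 'ResultSpeed']]
w30_sketch = w30_sketch.apply(pd.to_numeric, errors='coerce')  # 'M' and 'T' codes become NaN here
w30_sketch = w30_sketch.rolling(window=30, min_periods=1).mean().reset_index()
# w30_sketch.to_csv('weather_ave_30.csv', index=False)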
In [ ]:
# Note: GridSearchCV, roc_curve, and auc are used below, so they are imported here as well.
# sklearn.cross_validation / sklearn.grid_search are the pre-0.18 module names; in newer
# scikit-learn these live in sklearn.model_selection.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.cross_validation import cross_val_score, StratifiedKFold, train_test_split, KFold
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
In [ ]:
# Model evaluation function; relies on the global X, y, X_test, y_test defined below
def evaluate_model(model):
    model.fit(X, y)
    y_pred = model.predict(X_test)
    a = accuracy_score(y_test, y_pred)
    probabilities = model.predict_proba(X_test)
    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    confusion = pd.DataFrame(conmat, index=['1', '0'],
                             columns=['predicted_1', 'predicted_0'])
    cr = classification_report(y_test, y_pred)
    print 'confusion matrix:'
    print confusion
    print 'classification_report:'
    print cr
    print 'Accuracy of the model on test:', a
    return probabilities
In [ ]:
X = train[['week', 'year']]
y = train['WnvPresent']
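evaluate_model above also needs X_test and y_test, which are not defined anywhere else in the notebook; a sketch of the missing hold-out split (the test fraction and random state are assumptions):
In [ ]:
# assumed hold-out split so that evaluate_model's X_test / y_test exist
X, X_test, y, y_test = train_test_split(X, y, test_size=0.3, random_state=33)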
In [ ]:
## Grid search over Random Forest parameters
# max_depth must be a positive integer, so integer values are used here (assumed correction of the original float grid)
params = {'max_features': [0.1, 0.5, 1.0], 'max_depth': [3, 5, 10], 'n_estimators': [3, 5, 10, 50]}
gsrf = GridSearchCV(RandomForestClassifier(random_state=33),
                    params, n_jobs=-1,
                    cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gsrf.fit(X, y)
print 'best parameters for the model:', gsrf.best_params_
print 'best score on train:', gsrf.best_score_
probability = evaluate_model(gsrf.best_estimator_)
In [ ]:
# Grid search over KNN parameters
params = {'n_neighbors': range(2, 60)}
gsknn = GridSearchCV(KNeighborsClassifier(),
                     params, n_jobs=-1,
                     cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gsknn.fit(X, y)
print 'best parameters for the model:', gsknn.best_params_
print 'best score on train:', gsknn.best_score_
probability = evaluate_model(gsknn.best_estimator_)
In [ ]:
# Grid search over logistic regression parameters
# the 'sag' solver only supports the l2 penalty, so 'liblinear' is used here to allow both l1 and l2
params = {'penalty': ['l2', 'l1'], 'C': [0.1, 1.0, 10]}
gslr = GridSearchCV(LogisticRegression(solver='liblinear', random_state=33),
                    params, n_jobs=-1,
                    cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gslr.fit(X, y)
print 'best parameters for the model:', gslr.best_params_
print 'best score on train:', gslr.best_score_
probability = evaluate_model(gslr.best_estimator_)
In [ ]:
# Scale the features for the SVM (StandardScaler has to be fit to the data, not called on it)
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X_test = scaler.transform(X_test)
# Grid search over support vector machine parameters
params = {'C': [0.01, 0.1, 1.0, 10.0, 30.0, 100.0],
          'gamma': ['auto', 0.1, 1.0, 10.0],
          'kernel': ['linear', 'rbf']}
gssvc = GridSearchCV(SVC(probability=True, random_state=33),  # probability=True so evaluate_model can call predict_proba
                     params, n_jobs=-1,
                     cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gssvc.fit(scaled_X, y)
print 'best parameters for the model:', gssvc.best_params_
print 'best score on train:', gssvc.best_score_
probability = evaluate_model(gssvc.best_estimator_)  # caution: evaluate_model refits and predicts on the unscaled X/X_test
In [ ]:
## Grid search over Gradient Boosting parameters
# max_depth must be a positive integer, so integer values are used here (assumed correction of the original float grid)
params = {'max_features': [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5], 'n_estimators': [3, 5, 10, 50]}
gsgb = GridSearchCV(GradientBoostingClassifier(random_state=33),
                    params, n_jobs=-1,
                    cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gsgb.fit(X, y)
print 'best parameters for the model:', gsgb.best_params_
print 'best score on train:', gsgb.best_score_
probability = evaluate_model(gsgb.best_estimator_)
In [ ]:
### TODO: also compare the tuned models with cross_val_score
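A minimal sketch of that comparison, using the X and y defined above and the tuned estimators from the grid searches (not part of the original analysis):
In [ ]:
# sketch: 3-fold cross-validated accuracy for each tuned model
for name, est in [('Random Forest', gsrf.best_estimator_),
                  ('KNN', gsknn.best_estimator_),
                  ('Logistic Regression', gslr.best_estimator_),
                  ('Gradient Boosting', gsgb.best_estimator_)]:
    scores = cross_val_score(est, X, y, cv=StratifiedKFold(y, n_folds=3, shuffle=True))
    print name, 'mean CV accuracy:', scores.mean()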
In [ ]:
#AUC
y_score = []
for i in range(0, len(probability)):
    y_score.append(probability[i][1])
FPR = dict()
TPR = dict()
ROC_AUC = dict()
# For class 1, find the area under the curve
FPR[1], TPR[1], _ = roc_curve(y_test, y_score)
ROC_AUC[1] = auc(FPR[1], TPR[1])
# Plot of a ROC curve for class 1
plt.figure(figsize=[5,5])
plt.plot(FPR[1], TPR[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1], linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('Receiver operating characteristic for West Nile Virus presence', fontsize=12)
plt.legend(loc="lower right")
plt.savefig('ROC_AUC.png')
plt.show()